============================================================================================

K-means

============================================================================================

In [1]:
import warnings

warnings.filterwarnings('ignore', category=DeprecationWarning)
In [2]:
import pandas as pd
import numpy as np
import os
import re
import random
import time
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.decomposition import PCA

import fns_models as fns


% matplotlib inline

from subprocess import check_output
print(check_output(["ls", "data"]).decode("utf-8"))
athenaeum_authors.csv
athenaeum_authors_preview.csv
athenaeum_painting_filtered.csv
athenaeum_painting_movement.csv
athenaeum_painting_movement_test.csv
athenaeum_painting_movement_train.csv
athenaeum_paintings.csv
athenaeum_paintings_sizes.csv
color_hist_kmeans_206552.csv
color_histograms.csv
color_hist_size_206552.csv
complete_data.csv
extra_tree_com.csv
grad_boost_com.csv
images
images_athenaeum
images_sizes_2325.csv
kmeans_centers.csv
knn_com.csv
model_accuracy.csv
movement_hist_test.csv
movement_hist_train.csv
nbc_com.csv
net_predicted.csv
painter_info_clean.csv
painting_info_clean.csv
pca20_kmeans_test.csv
pca20_kmeans_train.csv
resized_200
rf_com.csv
test_author200.csv
test_data.csv
test_hist_author_knn.csv
test_hist_author_rf.csv
train_author200.csv
train_data.csv
train_hist_author_knn.csv
train_hist_author_rf.csv
xgb_com.csv

In [67]:
def plot_columns(sample_painting):
    """Show up to eight paintings from ``sample_painting`` as a grid of images.

    Parameters
    ----------
    sample_painting : pandas.DataFrame
        One row per painting. The ``author_id`` and ``painting_id`` columns are
        used to build the image path
        ``data/images_athenaeum/full/<author_id>/<painting_id>.jpg``.
        When more than eight rows are passed, eight are drawn at random.

    Returns
    -------
    None
        Images are drawn on a new matplotlib figure. For an empty frame only
        an informational message is printed and nothing is drawn.
    """
    if len(sample_painting) == 0:
        # print(...) with one argument gives the same output on Python 2 and 3.
        print("[INFO]: No painting for this cluster!")
        return
    if len(sample_painting) > 8:
        sample_painting = sample_painting.sample(8)

    # Imported only once there is something to draw.
    from PIL import Image

    size = len(sample_painting)
    # Up to four images share a single row; more than four are split over two rows.
    y = 1 if size <= 4 else 2
    x = size if y == 1 else (size + 1) // 2
    # squeeze=False always returns a 2-D array of axes, so one indexing scheme
    # covers the single-image, single-row and two-row layouts.
    f, ax = plt.subplots(y, x, figsize = (20, 15), squeeze = False)
    for i in range(size):
        painting = sample_painting.iloc[i]
        path = 'data/images_athenaeum/full/%d/%d.jpg' % (painting['author_id'],
                                                         painting['painting_id'])
        # Floor division keeps the row index an integer on Python 3 as well.
        curAxis = ax[i // x, i % x]
        # The context manager releases the file handle once the image is drawn.
        with Image.open(path) as im:
            curAxis.imshow(im)
        curAxis.set_yticks([])
        curAxis.set_xticks([])
    # An odd number of images above four leaves one grid slot unused; hide it.
    for j in range(size, x * y):
        ax[j // x, j % x].axis('off')
In [4]:
# Read the train and test tables, then preview the first test row.
train_id = pd.read_csv('data/train_hist_author_knn.csv')
test_id = pd.read_csv('data/test_hist_author_knn.csv')
test_id.head(n=1)
Out[4]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_px width_px height_width_ratio
0 444 12077 7603 1687 587 348 403 538 671 959 ... 69539 24143 46874 97427 117568 102332 79799 555 800 0.69375

1 rows × 35 columns

In [5]:
# Read the color-histogram table (it already carries a kmeans_labels column)
# and preview its first three rows.
color_hist = pd.read_csv('data/color_hist_kmeans_206552.csv')
color_hist.head(n=3)
Out[5]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
0 444 12077 7603 1687 587 348 403 538 671 959 ... 99139 69539 24143 46874 97427 117568 102332 79799 0.693750 0
1 444 11653 141479 107241 3576 1031 1010 3886 568 2650 ... 144006 99116 46843 25371 63071 67621 69665 480656 0.779412 3
2 444 12097 1429 713 600 581 1362 3304 8280 16148 ... 174675 107953 51343 77403 128000 147619 172075 150541 1.310585 0

3 rows × 34 columns

In [6]:
# Report (rows, columns) for each table. The parenthesised form prints the
# same text on Python 2 and is valid syntax on Python 3.
print(train_id.shape)
print(test_id.shape)
print(color_hist.shape)
(49890, 35)
(12473, 35)
(206552, 34)
In [7]:
# Restrict the labelled histogram table to the paintings that appear in the
# train and test tables; these subsets are the t-SNE input further down.
# The key column is selected by name rather than by position, so the merge
# does not depend on the column order of the CSV files.
train_tsne = color_hist.merge(train_id[['painting_id']], how='inner', on='painting_id')
test_tsne = color_hist.merge(test_id[['painting_id']], how='inner', on='painting_id')
print(train_tsne.shape)
print(test_tsne.shape)

train_tsne.head(1)
(49890, 34)
(12473, 34)
Out[7]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
0 444 11653 141479 107241 3576 1031 1010 3886 568 2650 ... 144006 99116 46843 25371 63071 67621 69665 480656 0.779412 3

1 rows × 34 columns

In [17]:
# filtered_out = ['height_px', 'width_px']
# color_hist = color_hist.drop(filtered_out, axis=1)
# NOTE(review): this cell needs a fitted `kmeans`, which is created in a cell
# further down the notebook; run that cell first.
# One label per row is required, otherwise the assignment would misalign.
assert len(kmeans.labels_) == len(color_hist), "label count does not match row count"
color_hist['kmeans_labels'] = kmeans.labels_
print(color_hist.shape)
# Overwrites the same CSV file that color_hist was originally loaded from.
color_hist.to_csv('data/color_hist_kmeans_206552.csv', index=False)
(206552, 34)
In [18]:
# Scale the histogram columns (everything between the two id columns and the
# last two columns) so that each row sums to 3.
# NOTE(review): the factor 3 presumably stands for three color channels whose
# bins should each sum to 1 - confirm against the histogram extraction code.
# The builtin float replaces the removed np.float alias, and converting before
# dividing guarantees true division on Python 2 as well.
hist_counts = color_hist.iloc[:, 2:-2].astype(float)
color_hist.iloc[:, 2:-2] = hist_counts.div(hist_counts.sum(axis=1) / 3, axis=0)

# movement_hist_test.iloc[:,3:-1] = movement_hist_test.iloc[:, 3:-1]\
#             .apply(lambda x: x.astype(np.float) / (x.sum()/3), axis = 1, raw = True)
In [19]:
color_hist.head(3)
Out[19]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
0 444 12077 0.017124 0.003800 0.001322 0.000784 0.000908 0.001212 0.001511 0.002160 ... 0.223286 0.156619 0.054376 0.105572 0.219430 0.264793 0.230477 0.179727 0.693750 6
1 444 11653 0.200286 0.151817 0.005062 0.001460 0.001430 0.005501 0.000804 0.003752 ... 0.203864 0.140315 0.066314 0.035917 0.089287 0.095728 0.098622 0.680446 0.779412 0
2 444 12097 0.002115 0.001055 0.000888 0.000860 0.002016 0.004890 0.012255 0.023900 ... 0.258533 0.159779 0.075992 0.114563 0.189451 0.218488 0.254685 0.222813 1.310585 2

3 rows × 34 columns

In [14]:
%%time
# prepare Kmeans data
kmeans = KMeans(n_init = 100, n_jobs=4)
kmeans.set_params(n_clusters=7)
kmeans.fit(color_hist.iloc[:, 2:-1])
CPU times: user 10.6 s, sys: 316 ms, total: 10.9 s
Wall time: 2min 59s
In [15]:
# Take .shape inside the call: `print (labels).shape` only works as a Python 2
# print statement and would access .shape on None under Python 3.
print(kmeans.labels_.shape)
kmeans.labels_
(206552,)
Out[15]:
array([6, 0, 2, ..., 6, 4, 6], dtype=int32)
In [16]:
color_hist.iloc[:, 2:].columns
Out[16]:
Index([u'hist_01', u'hist_02', u'hist_03', u'hist_04', u'hist_05', u'hist_06',
       u'hist_07', u'hist_08', u'hist_09', u'hist_10', u'hist_11', u'hist_12',
       u'hist_13', u'hist_14', u'hist_15', u'hist_16', u'hist_17', u'hist_18',
       u'hist_19', u'hist_20', u'hist_21', u'hist_22', u'hist_23', u'hist_24',
       u'hist_25', u'hist_26', u'hist_27', u'hist_28', u'hist_29', u'hist_30',
       u'height_width_ratio', u'kmeans_labels'],
      dtype='object')
In [20]:
kmeans.cluster_centers_.shape
Out[20]:
(7, 31)
In [42]:
# pd.DataFrame(kmeans.cluster_centers_).to_csv('data/kmeans_centers.csv', index=False)
In [71]:
def get_paintings_around_centroid(centroid, color_hist, num_paintings):
    """Return the paintings whose feature vectors lie closest to ``centroid``.

    Parameters
    ----------
    centroid : array-like
        Cluster centre with one value per feature column.
    color_hist : pandas.DataFrame
        Table with ``painting_id`` and ``author_id`` columns. The first two
        columns and the last column are skipped; every column in between is
        used as a feature.
    num_paintings : int
        Number of nearest rows to return.

    Returns
    -------
    pandas.DataFrame
        ``painting_id`` and ``author_id`` of the nearest rows, closest first,
        with their original index labels.
    """
    features = color_hist.iloc[:, 2:-1].values
    # Squared Euclidean distance of every row to the centroid, computed in one
    # vectorised pass instead of a Python-level loop over rows.
    distances = ((features - np.asarray(centroid)) ** 2).sum(axis=1)
    # Stable sort: equally distant rows keep their original order.
    nearest = np.argsort(distances, kind='mergesort')[:num_paintings]
    # Positional selection returns exactly the chosen rows even when index
    # labels are not unique.
    return color_hist[['painting_id', 'author_id']].iloc[nearest]
In [21]:
def get_paintings_around_centroid(centroid, color_hist, num_paintings):
    """Return the paintings whose feature vectors lie closest to ``centroid``.

    Parameters
    ----------
    centroid : array-like
        Cluster centre with one value per feature column.
    color_hist : pandas.DataFrame
        Table with ``painting_id`` and ``author_id`` columns. The first two
        columns and the last column are skipped; every column in between is
        used as a feature.
    num_paintings : int
        Number of nearest rows to return.

    Returns
    -------
    pandas.DataFrame
        ``painting_id`` and ``author_id`` of the nearest rows, closest first,
        with their original index labels.
    """
    features = color_hist.iloc[:, 2:-1].values
    # Squared Euclidean distance of every row to the centroid, computed in one
    # vectorised pass instead of a Python-level loop over rows.
    distances = ((features - np.asarray(centroid)) ** 2).sum(axis=1)
    # Stable sort: equally distant rows keep their original order.
    nearest = np.argsort(distances, kind='mergesort')[:num_paintings]
    # Positional selection returns exactly the chosen rows even when index
    # labels are not unique.
    return color_hist[['painting_id', 'author_id']].iloc[nearest]
In [22]:
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[0], color_hist, 4))
CPU times: user 1.52 s, sys: 56 ms, total: 1.57 s
Wall time: 1.82 s
In [23]:
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[1], color_hist, 4))
CPU times: user 1.57 s, sys: 52 ms, total: 1.62 s
Wall time: 1.93 s
In [24]:
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[2], color_hist, 4))
CPU times: user 1.53 s, sys: 48 ms, total: 1.58 s
Wall time: 1.88 s
In [25]:
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[3], color_hist, 4))
CPU times: user 1.43 s, sys: 80 ms, total: 1.51 s
Wall time: 1.65 s
In [27]:
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[4], color_hist, 4))
CPU times: user 1.71 s, sys: 64 ms, total: 1.77 s
Wall time: 1.98 s
In [28]:
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[5], color_hist, 4))
CPU times: user 1.44 s, sys: 52 ms, total: 1.5 s
Wall time: 1.84 s
In [29]:
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[6], color_hist, 4))
CPU times: user 1.47 s, sys: 36 ms, total: 1.5 s
Wall time: 1.78 s
In [ ]:
 
In [30]:
def plot_columns_kmeans(centroids, color_hist, num_per_cluster, art_movements = None):
    """Plot, for every centroid, the paintings closest to it as one row of images.

    Parameters
    ----------
    centroids : numpy.ndarray
        2-D array with one row per cluster centre.
    color_hist : pandas.DataFrame
        Feature table passed on to ``get_paintings_around_centroid``.
    num_per_cluster : int
        Number of nearest paintings shown per cluster (columns of the grid).
    art_movements : pandas.DataFrame, optional
        Table with ``author_id``, ``painting_id`` and ``sup_art_movement``
        columns; when given, each image is titled with its movement.

    Returns
    -------
    None
        Images are drawn on a new matplotlib figure.
    """
    from PIL import Image
    base_dim = 18
    num_clusters = centroids.shape[0]
    # squeeze=False always returns a 2-D array of axes, so a single cluster or
    # a single column no longer needs (or breaks) special-case indexing.
    f, ax = plt.subplots(num_clusters, num_per_cluster,
                         figsize = (base_dim, base_dim * num_clusters / num_per_cluster),
                         squeeze = False)
    movement_lookup = None
    if art_movements is not None:
        # Built once for all clusters. Duplicate keys are dropped so the left
        # merge below cannot return more rows than there are grid slots.
        movement_lookup = (art_movements[['author_id', 'painting_id', 'sup_art_movement']]
                           .drop_duplicates(subset = ['author_id', 'painting_id']))
    for y, centroid in enumerate(centroids):
        paintings = get_paintings_around_centroid(centroid, color_hist, num_per_cluster)
        if movement_lookup is not None:
            paintings = paintings.merge(movement_lookup, how = 'left',
                                        on = ['author_id', 'painting_id'])
        for i in range(num_per_cluster):
            curAxis = ax[y, i]
            curAxis.set_yticks([])
            curAxis.set_xticks([])
            if i == 0:
                curAxis.set_ylabel('cluster #%d' % y)
            if i >= len(paintings):
                # Fewer paintings than slots: leave the remaining axes blank.
                continue
            painting = paintings.iloc[i]
            path = 'data/images_athenaeum/full/%d/%d.jpg' % (painting['author_id'],
                                                             painting['painting_id'])
            # The context manager releases the file handle once the image is drawn.
            with Image.open(path) as im:
                curAxis.imshow(im)
            if movement_lookup is not None:
                curAxis.set_title(painting['sup_art_movement'])
In [31]:
color_hist.head(1)
Out[31]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
0 444 12077 0.017124 0.0038 0.001322 0.000784 0.000908 0.001212 0.001511 0.00216 ... 0.223286 0.156619 0.054376 0.105572 0.21943 0.264793 0.230477 0.179727 0.69375 6

1 rows × 34 columns

In [32]:
%%time
plot_columns_kmeans(kmeans.cluster_centers_, color_hist, 8, art_movements=pd.read_csv('data/athenaeum_painting_movement.csv'))
CPU times: user 13.1 s, sys: 520 ms, total: 13.6 s
Wall time: 15.9 s
In [ ]:
 
In [40]:
 
In [41]:
 
In [26]:
# Read the stored cluster centres from disk and preview the first one.
kmeans_centers = pd.read_csv('data/kmeans_centers.csv')
kmeans_centers.head(n=1)
Out[26]:
0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 26 27 28 29 30 31
0 24830.953822 22836.710172 15404.427763 8597.501116 8084.719427 8002.708462 6372.687827 9542.462058 14894.858453 33008.370668 ... 116874.716528 70462.102712 38515.344498 97276.180586 128723.37147 110653.678089 105103.85313 86111.118323 1.034743 5.960276

1 rows × 32 columns

In [33]:
color_hist[color_hist['kmeans_labels'] == 0].iloc[:, 2:-1].shape
Out[33]:
(30984, 31)
In [ ]:
 
In [ ]:
 

t-SNE for cluster plotting

In [ ]:
# Memory error
# NOTE(review): this cell was never run to completion (no execution count).
# In document order TSNE is only imported in a later cell, so a fresh
# top-to-bottom run raises NameError here before it reaches the memory problem.
# `iloc[:, 1:]` keeps every column except the first, so painting_id and
# kmeans_labels are handed to t-SNE as features, and the embedding returned by
# fit_transform is not stored.
tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
tsne.fit_transform(color_hist.iloc[:, 1:]) 
In [34]:
# Stack the train and test subsets into one table. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the test rows keep their own labels,
# which repeat labels already used by the train rows.
tsne_data = pd.concat([train_tsne, test_tsne], axis=0, ignore_index=True)
print(tsne_data.shape)
tsne_data.head(1)
(62363, 34)
Out[34]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
0 444 11653 141479 107241 3576 1031 1010 3886 568 2650 ... 144006 99116 46843 25371 63071 67621 69665 480656 0.779412 3

1 rows × 34 columns

In [36]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
# Embed a reproducible 5000-row sample. The slice stops before the last column
# so that kmeans_labels, which is used to colour the plot, is not also fed to
# t-SNE as a feature. The seed selects the same rows as before.
tsne_trainsformed = tsne.fit_transform(tsne_data.iloc[:, 2:-1].sample(5000, random_state = 123))
In [37]:
tsne_data.head(1)
Out[37]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
0 444 11653 141479 107241 3576 1031 1010 3886 568 2650 ... 144006 99116 46843 25371 63071 67621 69665 480656 0.779412 3

1 rows × 34 columns

In [40]:
from sklearn.manifold import TSNE

# Perform t-distributed stochastic neighbor embedding.
# The `time` module imported at the top of the notebook is used directly;
# `from time import time` would rebind the name `time` to a function.
t0 = time.time()
tsne = TSNE(n_components=2, init='pca', random_state=2017)
# Seeded sample (same seed as the colouring cell) without the kmeans_labels
# column, so the timed run is reproducible and does not use the labels as input.
trans_data = tsne.fit_transform(tsne_data.iloc[:, 2:-1].sample(5000, random_state=123)).T
t1 = time.time()
print("t-SNE: %.2g sec" % (t1 - t0))
t-SNE: 34 sec
<matplotlib.figure.Figure at 0x7efc4229f190>
In [ ]:
tsne_trainsformed
In [43]:
# Cluster label of every sampled painting. The seed matches the one used to
# sample the t-SNE input, so labels and embedded points line up row by row.
colors = tsne_data.iloc[:, -1].sample(5000, random_state = 123)
fig = plt.figure(figsize=(18, 10))

plt.scatter(tsne_trainsformed[:,0], tsne_trainsformed[:,1],
            c=np.array(colors),
            cmap=plt.cm.rainbow)
# Descriptive title: the previous one reported t1 - t0, which times a different
# t-SNE run than the embedding plotted here.
plt.title("t-SNE of 5000 sampled paintings, coloured by k-means cluster")
# 'auto' is the supported spelling of the removed 'normal' option.
plt.axis('auto')
# Write the file before show(), while the figure is guaranteed to be open.
fig.savefig('data/kmeans.png', dpi=fig.dpi)
plt.show()
In [44]:
# Draw 5000 rows with a fixed seed, then preview per-painting sums grouped by
# painting id and cluster label.
cluster_data = tsne_data.sample(n=5000, random_state=123)
cluster_data.groupby(['painting_id', 'kmeans_labels']).sum().head(3)
Out[44]:
author_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 hist_09 ... hist_22 hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio
painting_id kmeans_labels
46 0 3 4632 17697 27873 8045 5106 7108 34031 80487 50199 ... 254968 57193 44279 145866 33685 234707 87567 121491 221750 1.092500
57 0 3 3385 948 330 214 289 323 340 737 1520 ... 73072 96344 107459 2401 14914 112861 71211 68789 45825 1.562500
66 0 104 27102 38535 84182 70597 31375 16909 11903 13154 18384 ... 228506 112787 44264 42119 4744 29458 77529 172329 373081 0.726604

3 rows × 32 columns

In [45]:
cluster_data[cluster_data['author_id'] == 24]['kmeans_labels'].value_counts()
Out[45]:
4    68
0    48
3    17
2    11
Name: kmeans_labels, dtype: int64
In [46]:
authors = pd.read_csv('data/athenaeum_authors.csv')
sum(authors.first_name == 'Vincent')
Out[46]:
2
In [47]:
authors[authors.last_name == 'Gogh']
Out[47]:
first_name last_name death_year bio_url nationality art_movement author_id bio_info birth_year
4 Vincent Willem van Gogh 1890.0 http://www.the-athenaeum.org/people/detail.php... Dutch Post-Impressionist 789 Dutch Post-Impressionist painter who remains o... 1853
In [48]:
authors[authors.last_name == 'Monet']
Out[48]:
first_name last_name death_year bio_url nationality art_movement author_id bio_info birth_year
1 Oscar-Claude Monet 1926.0 http://www.the-athenaeum.org/people/detail.php... French Impressionist 13 French Impressionist painter (Paris, 14 Novemb... 1840

Van Gogh Analysis

In [49]:
# Rows of the sample that belong to author 789, reduced to ids and cluster label.
van_gogh_data = cluster_data.loc[cluster_data['author_id'] == 789,
                                 ['author_id', 'painting_id', 'kmeans_labels']]
van_gogh_data.head(5)
Out[49]:
author_id painting_id kmeans_labels
14841 789 38153 4
38596 789 190598 3
15365 789 15190 0
15024 789 186735 0
15261 789 15118 2
In [50]:
# Number of sampled paintings per cluster label.
(van_gogh_data.iloc[:, 1:]
    .groupby(['kmeans_labels'])
    .count()
    .reset_index()
    .rename(columns={'painting_id': 'painting_num'}))
Out[50]:
kmeans_labels painting_num
0 0 23
1 1 2
2 2 7
3 3 18
4 4 16
5 6 1
In [51]:
# One entry per cluster label 0..6, keyed 'van_gogh_<label>'; clusters without
# any painting map to an empty frame.
van_gogh_clusters = {}
for i in range(7):
    name = 'van_gogh_%d' % i
    van_gogh_clusters.update({name: van_gogh_data.loc[van_gogh_data['kmeans_labels'].eq(i)]})
    
# van_gogh_clusters
In [52]:
plot_columns(van_gogh_clusters['van_gogh_0'])
In [53]:
plot_columns(van_gogh_clusters['van_gogh_1'])
In [68]:
plot_columns(van_gogh_clusters['van_gogh_2'])
In [69]:
plot_columns(van_gogh_clusters['van_gogh_3'])
In [70]:
plot_columns(van_gogh_clusters['van_gogh_4'])
In [72]:
# No cluster 5 for van gogh
plot_columns(van_gogh_clusters['van_gogh_5'])
[INFO]: No painting for this cluster!
In [73]:
plot_columns(van_gogh_clusters['van_gogh_6'])
In [59]:
van_gogh_6 = van_gogh_data[van_gogh_data['kmeans_labels'] == 6]
van_gogh_6.head(3)
Out[59]:
author_id painting_id kmeans_labels
15241 789 192228 6

Monet Analysis

In [60]:
# Rows of the sample that belong to author 13, reduced to ids and cluster label.
Monet_data = cluster_data.loc[cluster_data['author_id'] == 13,
                              ['author_id', 'painting_id', 'kmeans_labels']]
Monet_data.head(3)
Out[60]:
author_id painting_id kmeans_labels
3301 13 3642 0
894 13 3197 0
2797 13 3676 0
In [61]:
# Number of sampled paintings per cluster label.
(Monet_data.iloc[:, 1:]
    .groupby(['kmeans_labels'])
    .count()
    .reset_index()
    .rename(columns={'painting_id': 'painting_num'}))
Out[61]:
kmeans_labels painting_num
0 0 51
1 2 6
2 3 25
3 4 11
4 6 1
In [62]:
# One entry per cluster label 0..6, keyed 'monet_<label>'; clusters without any
# painting map to an empty frame.
monet_clusters = {}
for i in range(7):
    name = 'monet_%d' % i
    monet_clusters.update({name: Monet_data.loc[Monet_data['kmeans_labels'].eq(i)]})
    
# monet_clusters    
In [74]:
plot_columns(monet_clusters['monet_0'])
In [75]:
plot_columns(monet_clusters['monet_1'])
[INFO]: No painting for this cluster!
In [76]:
plot_columns(monet_clusters['monet_2'])
In [77]:
plot_columns(monet_clusters['monet_3'])
In [78]:
plot_columns(monet_clusters['monet_4'])
In [80]:
plot_columns(monet_clusters['monet_5'])
[INFO]: No painting for this cluster!
In [81]:
plot_columns(monet_clusters['monet_6'])
In [ ]:
 

Clusters for paintings

In [82]:
cluster_data.head(2)
Out[82]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
43285 1341 214204 373 99 3 1 2 57 0 32 ... 92777 86512 44728 101131 91366 96644 19199 2460 1.158301 0
27804 495 32439 138994 145282 4899 1682 2535 6230 1156 1682 ... 249249 193940 65321 411998 186697 88271 57984 48970 1.160822 2

2 rows × 34 columns

In [83]:
# Number of sampled paintings in each cluster.
(cluster_data[['painting_id', 'kmeans_labels']]
    .groupby('kmeans_labels')
    .count()
    .rename(columns={'painting_id': 'painting_num'}))
Out[83]:
painting_num
kmeans_labels
0 2193
1 103
2 554
3 1017
4 967
5 61
6 105
In [84]:
cluster_data[cluster_data['kmeans_labels'] == 0].head(3)
Out[84]:
author_id painting_id hist_01 hist_02 hist_03 hist_04 hist_05 hist_06 hist_07 hist_08 ... hist_23 hist_24 hist_25 hist_26 hist_27 hist_28 hist_29 hist_30 height_width_ratio kmeans_labels
43285 1341 214204 373 99 3 1 2 57 0 32 ... 92777 86512 44728 101131 91366 96644 19199 2460 1.158301 0
40618 5401 158450 3204 1282 580 320 382 434 221 453 ... 261641 85671 4142 29736 151309 293514 282397 5044 1.312336 0
43432 1829 154333 18733 13129 8100 6689 9046 18704 32766 52468 ... 36589 5909 3493 89133 337258 201451 57459 33739 0.780208 0

3 rows × 34 columns

In [85]:
# One entry per cluster label 0..6, keyed 'cluster_<label>', each holding the
# sampled rows assigned to that label.
paintings_clusters = {}
for i in range(7):
    name = 'cluster_%d' % i
    paintings_clusters.update({name: cluster_data.loc[cluster_data['kmeans_labels'].eq(i)]})
    
# paintings_clusters  
In [86]:
plot_columns(paintings_clusters['cluster_0'])
In [87]:
plot_columns(paintings_clusters['cluster_1'])
In [88]:
plot_columns(paintings_clusters['cluster_2'])
In [89]:
plot_columns(paintings_clusters['cluster_3'])
In [90]:
plot_columns(paintings_clusters['cluster_4'])
In [91]:
plot_columns(paintings_clusters['cluster_5'])
In [92]:
plot_columns(paintings_clusters['cluster_6'])
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
# NOTE(review): scratch cell that was never executed (no execution count).
# `distances` is only visible here as a local variable inside
# get_paintings_around_centroid, so the second line presumably raises
# NameError at notebook level - confirm and remove the cell.
color_hist.loc[:,'painting_id']
distances.nsmallest(4).index